Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
import ydata_profiling as pp
pd.set_option('display.max_columns' , 50 )
warnings.filterwarnings("ignore" )
Load the Dataset
file_url = "data/vehicles.csv.zip"
fueleco = pd.read_csv(file_url)
fueleco
0
15.695714
0.0
0.0
0.0
19
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
423.190476
21
0.0
0
0.0
0.0
0.0
0.0
4.0
2.0
Rear-Wheel Drive
...
23.3333
0.0
35.0000
0.0
Two Seaters
1985
-1750
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
1
29.964545
0.0
0.0
0.0
9
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
807.909091
11
0.0
0
0.0
0.0
0.0
0.0
12.0
4.9
Rear-Wheel Drive
...
11.0000
0.0
19.0000
0.0
Two Seaters
1985
-10500
T
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
2
12.207778
0.0
0.0
0.0
23
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
329.148148
27
0.0
0
0.0
0.0
0.0
0.0
4.0
2.2
Front-Wheel Drive
...
29.0000
0.0
47.0000
0.0
Subcompact Cars
1985
250
NaN
SIL
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
3
29.964545
0.0
0.0
0.0
10
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
807.909091
11
0.0
0
0.0
0.0
0.0
0.0
8.0
5.2
Rear-Wheel Drive
...
12.2222
0.0
16.6667
0.0
Vans
1985
-10500
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
4
17.347895
0.0
0.0
0.0
17
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
467.736842
19
0.0
0
0.0
0.0
0.0
0.0
4.0
2.2
4-Wheel or All-Wheel Drive
...
21.0000
0.0
32.0000
0.0
Compact Cars
1993
-4750
NaN
NaN
T
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
39096
14.982273
0.0
0.0
0.0
19
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
403.954545
22
0.0
0
0.0
0.0
0.0
0.0
4.0
2.2
Front-Wheel Drive
...
24.0000
0.0
37.0000
0.0
Compact Cars
1993
-1500
NaN
CLKUP
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
39097
14.330870
0.0
0.0
0.0
20
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
386.391304
23
0.0
0
0.0
0.0
0.0
0.0
4.0
2.2
Front-Wheel Drive
...
25.0000
0.0
39.0000
0.0
Compact Cars
1993
-1000
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
39098
15.695714
0.0
0.0
0.0
18
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
423.190476
21
0.0
0
0.0
0.0
0.0
0.0
4.0
2.2
4-Wheel or All-Wheel Drive
...
23.0000
0.0
34.0000
0.0
Compact Cars
1993
-1750
NaN
CLKUP
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
39099
15.695714
0.0
0.0
0.0
18
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
423.190476
21
0.0
0
0.0
0.0
0.0
0.0
4.0
2.2
4-Wheel or All-Wheel Drive
...
23.0000
0.0
34.0000
0.0
Compact Cars
1993
-1750
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
39100
18.311667
0.0
0.0
0.0
16
0.0
0
0.0
0.0
0.0
0.0
-1
-1
0.0
493.722222
18
0.0
0
0.0
0.0
0.0
0.0
4.0
2.2
4-Wheel or All-Wheel Drive
...
20.0000
0.0
29.0000
0.0
Compact Cars
1993
-5500
NaN
CLKUP
T
NaN
NaN
NaN
NaN
NaN
NaN
NaN
0.0
NaN
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
NaN
0
0
0
39101 rows × 83 columns
fueleco.select_dtypes(exclude= "object" ).mean()
fueleco.select_dtypes(exclude= "object" ).std()
fueleco.select_dtypes(exclude= "object" ).info()
fueleco.select_dtypes(exclude= ["object" , "bool" ]).quantile(
[0 , 0.25 , 0.5 , 0.75 , 1 ]
)
fueleco.select_dtypes(exclude= ["object" , "bool" ]).describe().T
fueleco.describe(include= object )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 60 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 barrels08 39101 non-null float64
1 barrelsA08 39101 non-null float64
2 charge120 39101 non-null float64
3 charge240 39101 non-null float64
4 city08 39101 non-null int64
5 city08U 39101 non-null float64
6 cityA08 39101 non-null int64
7 cityA08U 39101 non-null float64
8 cityCD 39101 non-null float64
9 cityE 39101 non-null float64
10 cityUF 39101 non-null float64
11 co2 39101 non-null int64
12 co2A 39101 non-null int64
13 co2TailpipeAGpm 39101 non-null float64
14 co2TailpipeGpm 39101 non-null float64
15 comb08 39101 non-null int64
16 comb08U 39101 non-null float64
17 combA08 39101 non-null int64
18 combA08U 39101 non-null float64
19 combE 39101 non-null float64
20 combinedCD 39101 non-null float64
21 combinedUF 39101 non-null float64
22 cylinders 38956 non-null float64
23 displ 38958 non-null float64
24 engId 39101 non-null int64
25 feScore 39101 non-null int64
26 fuelCost08 39101 non-null int64
27 fuelCostA08 39101 non-null int64
28 ghgScore 39101 non-null int64
29 ghgScoreA 39101 non-null int64
30 highway08 39101 non-null int64
31 highway08U 39101 non-null float64
32 highwayA08 39101 non-null int64
33 highwayA08U 39101 non-null float64
34 highwayCD 39101 non-null float64
35 highwayE 39101 non-null float64
36 highwayUF 39101 non-null float64
37 hlv 39101 non-null int64
38 hpv 39101 non-null int64
39 id 39101 non-null int64
40 lv2 39101 non-null int64
41 lv4 39101 non-null int64
42 phevBlended 39101 non-null bool
43 pv2 39101 non-null int64
44 pv4 39101 non-null int64
45 range 39101 non-null int64
46 rangeCity 39101 non-null float64
47 rangeCityA 39101 non-null float64
48 rangeHwy 39101 non-null float64
49 rangeHwyA 39101 non-null float64
50 UCity 39101 non-null float64
51 UCityA 39101 non-null float64
52 UHighway 39101 non-null float64
53 UHighwayA 39101 non-null float64
54 year 39101 non-null int64
55 youSaveSpend 39101 non-null int64
56 charge240b 39101 non-null float64
57 phevCity 39101 non-null int64
58 phevHwy 39101 non-null int64
59 phevComb 39101 non-null int64
dtypes: bool(1), float64(32), int64(27)
memory usage: 17.6 MB
count
37912
23431
39101
39101
39101
39101
39101
39090
39101
2335
15047
5816
738
3204
1490
1485
644
8289
53
51
39101
39101
7405
unique
7
545
14
6
134
3816
2
37
34
3
52
1
1
8
4
216
125
46
5
4
195
68
2
top
Front-Wheel Drive
(FFS)
Regular
Regular Gasoline
Chevrolet
F150 Pickup 2WD
N
Automatic 4-spd
Compact Cars
G
CLKUP
T
S
FFV
E85
290
288V Ni-MH
GMX
standard charger
80 amp dual charger
Tue Jan 01 00:00:00 EST 2013
Tue Jan 01 00:00:00 EST 2013
N
freq
13653
8827
25620
26931
3900
214
26400
11045
5661
1356
7809
5816
738
1383
1383
74
117
1255
42
42
34199
29438
5176
Column Types
fueleco.dtypes.value_counts()
fueleco.select_dtypes(include= "bool" )
0
False
1
False
2
False
3
False
4
False
...
...
39096
False
39097
False
39098
False
39099
False
39100
False
39101 rows × 1 columns
fueleco.select_dtypes("int64" ).describe().T
city08
39101.0
18.077799
6.970672
6.0
15.0
17.0
20.0
150.0
cityA08
39101.0
0.569883
4.297124
0.0
0.0
0.0
0.0
145.0
co2
39101.0
72.538989
163.252019
-1.0
-1.0
-1.0
-1.0
847.0
co2A
39101.0
5.543950
55.956932
-1.0
-1.0
-1.0
-1.0
713.0
comb08
39101.0
20.323828
6.882807
7.0
17.0
20.0
23.0
136.0
combA08
39101.0
0.631160
4.395797
0.0
0.0
0.0
0.0
133.0
engId
39101.0
8582.377382
17606.675590
0.0
0.0
202.0
4401.0
69102.0
feScore
39101.0
0.122580
2.516348
-1.0
-1.0
-1.0
-1.0
10.0
fuelCost08
39101.0
2242.470781
601.273869
500.0
1850.0
2250.0
2500.0
6850.0
fuelCostA08
39101.0
91.335260
479.485802
0.0
0.0
0.0
0.0
3850.0
ghgScore
39101.0
0.120866
2.512612
-1.0
-1.0
-1.0
-1.0
10.0
ghgScoreA
39101.0
-0.923889
0.651017
-1.0
-1.0
-1.0
-1.0
8.0
highway08
39101.0
24.208588
7.128070
9.0
20.0
24.0
27.0
122.0
highwayA08
39101.0
0.736452
4.694207
0.0
0.0
0.0
0.0
121.0
hlv
39101.0
2.029539
5.959735
0.0
0.0
0.0
0.0
49.0
hpv
39101.0
10.411243
28.167271
0.0
0.0
0.0
0.0
195.0
id
39101.0
19662.541188
11413.329199
1.0
9776.0
19552.0
29555.0
39483.0
lv2
39101.0
1.834812
4.407887
0.0
0.0
0.0
0.0
41.0
lv4
39101.0
6.155930
9.698101
0.0
0.0
0.0
13.0
55.0
pv2
39101.0
13.649574
31.214466
0.0
0.0
0.0
0.0
194.0
pv4
39101.0
33.883711
45.991687
0.0
0.0
0.0
91.0
192.0
range
39101.0
0.500243
9.742080
0.0
0.0
0.0
0.0
335.0
year
39101.0
2000.635406
10.690422
1984.0
1991.0
2001.0
2010.0
2018.0
youSaveSpend
39101.0
-3459.572645
3010.284617
-26500.0
-4750.0
-3500.0
-1500.0
5250.0
phevCity
39101.0
0.094703
2.279478
0.0
0.0
0.0
0.0
97.0
phevHwy
39101.0
0.094269
2.191115
0.0
0.0
0.0
0.0
81.0
phevComb
39101.0
0.094141
2.226500
0.0
0.0
0.0
0.0
88.0
np.iinfo(np.int16)
fueleco[["city08" , "comb08" ]].info(memory_usage= "deep" )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 city08 39101 non-null int64
1 comb08 39101 non-null int64
dtypes: int64(2)
memory usage: 611.1 KB
(
fueleco[["city08" , "comb08" ]]
.assign(
city08= fueleco.city08.astype(np.int16),
comb08= fueleco.comb08.astype(np.int16),
)
.info(memory_usage= "deep" )
)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 city08 39101 non-null int16
1 comb08 39101 non-null int16
dtypes: int16(2)
memory usage: 152.9 KB
fueleco[["make" ]].info(memory_usage= "deep" )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 make 39101 non-null object
dtypes: object(1)
memory usage: 2.4 MB
(
fueleco[["make" ]]
.assign(make= fueleco.make.astype("category" ))
.info(memory_usage= "deep" )
)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39101 entries, 0 to 39100
Data columns (total 1 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 make 39101 non-null category
dtypes: category(1)
memory usage: 89.5 KB
Categorical data
fueleco.select_dtypes(object ).columns
fueleco.drive.nunique()
fueleco.drive.sample(5 , random_state= 42 )
fueleco.drive.isna().mean() * 100
fueleco.drive.value_counts(dropna= False )
drive
Front-Wheel Drive 13653
Rear-Wheel Drive 13284
4-Wheel or All-Wheel Drive 6648
All-Wheel Drive 2401
4-Wheel Drive 1221
NaN 1189
2-Wheel Drive 507
Part-time 4-Wheel Drive 198
Name: count, dtype: int64
top_n = fueleco.make.value_counts().index[:6 ]
(
fueleco
.assign(
make= fueleco.make.where(
fueleco.make.isin(top_n), "Other"
)
)
).make.value_counts(dropna= False )
make
Other 23211
Chevrolet 3900
Ford 3208
Dodge 2557
GMC 2442
Toyota 1976
BMW 1807
Name: count, dtype: int64
fig, ax = plt.subplots(figsize= (10 , 8 ))
top_n = fueleco.make.value_counts().index[:6 ]
(
fueleco
.assign(
make= fueleco.make.where(
fueleco.make.isin(top_n), "Other"
)
)
.make.value_counts(dropna= False )
.plot.bar(ax= ax)
)
fig.savefig("c5-catpan.png" , dpi= 300 )
fig, ax = plt.subplots(figsize= (10 ,8 ))
top_n = fueleco.make.value_counts().index[:6 ]
sns.countplot(
y= "make" ,
data= (
fueleco
.assign(make= fueleco.make.where(fueleco.make.isin(top_n), "Other" ))
)
)
fig.savefig("c5-catsns.png" , dpi= 300 )
fueleco.rangeA.value_counts()
rangeA
290 74
270 56
280 53
310 41
277 38
..
351 1
240/290/290 1
303 1
340-350 1
341 1
Name: count, Length: 216, dtype: int64
fueleco.rangeA
(
fueleco.rangeA.str .extract(r"([^0-9.])" )
.dropna()
.apply (lambda row: "" .join(row), axis= 1 )
.value_counts()
)
/ 280
- 71
Name: count, dtype: int64
set (fueleco.rangeA.apply (type ))
fueleco.rangeA.isna().sum ()
(
fueleco.rangeA
.fillna("0" )
.str .replace("-" , "/" )
.str .split("/" , expand= True )
.astype(float )
.mean(axis= 1 )
.pipe(lambda ser_: pd.cut(ser_, 10 ))
.value_counts()
# .pipe(lambda s_: s_.where(s_< 100))
# .dropna()
# .sort_values()
)
(-0.45, 44.95] 37688
(269.7, 314.65] 559
(314.65, 359.6] 352
(359.6, 404.55] 205
(224.75, 269.7] 181
(404.55, 449.5] 82
(89.9, 134.85] 12
(179.8, 224.75] 9
(44.95, 89.9] 8
(134.85, 179.8] 5
Name: count, dtype: int64
Continuous Data
fueleco.select_dtypes("number" )
fueleco.city08.sample(5 , random_state= 42 )
fueleco.city08.isna().sum ()
fueleco.city08.describe()
count 39101.000000
mean 18.077799
std 6.970672
min 6.000000
25% 15.000000
50% 17.000000
75% 20.000000
max 150.000000
Name: city08, dtype: float64
fig, ax = plt.subplots(figsize= (10 ,8 ))
fueleco.city08.hist(ax= ax, bins= 30 )
fig.savefig(
"c5-conthistpan.png" ,
dpi= 300
)
fig, ax = plt.subplots(figsize= (10 ,8 ))
sns.distplot(fueleco.city08, rug= True , ax= ax)
fig.savefig(
"c5-conthistsns.png" ,
dpi= 300
)
fig, axs = plt.subplots(nrows= 3 , figsize= (10 ,8 ))
sns.boxplot(x= fueleco.city08, ax= axs[0 ])
sns.violinplot(x= fueleco.city08, ax= axs[1 ])
sns.boxenplot(x= fueleco.city08, ax= axs[2 ])
fig.savefig(
"c5-contothersns.png" ,
dpi= 300
)
stats.kstest(fueleco.city08, cdf= "norm" )
KstestResult(statistic=0.9999999990134123, pvalue=0.0)
fig, ax = plt.subplots(figsize= (10 , 8 ))
stats.probplot(fueleco.city08, plot= ax)
fig.savefig(
"c5-conprob.png" ,
dpi= 300
)
Comparing continuous values across categories
mask = fueleco.make.isin(
["Ford" , "Honda" , "Tesla" , "BMW" ]
)
fueleco[mask].groupby("make" ).city08.agg(
["mean" , "std" ]
)
make
BMW
17.817377
7.372907
Ford
16.853803
6.701029
Honda
24.372973
9.154064
Tesla
92.826087
5.538970
g = sns.catplot(
x= "make" , y= "city08" , data= fueleco[mask], kind= "box"
)
(fueleco[mask]
.groupby("make" )
.city08
.count()
)
make
BMW 1807
Ford 3208
Honda 925
Tesla 46
Name: city08, dtype: int64
g = sns.catplot(
x= "make" , y= "city08" , data= fueleco[mask], kind= "box"
)
sns.swarmplot(
x= "make" ,
y= "city08" ,
data= fueleco[mask],
color= "k" ,
size= 1 ,
ax= g.ax,
)
g.ax.figure
g = sns.catplot(
x= "make" ,
y= "city08" ,
data= fueleco[mask],
kind= "box" ,
col= "year" ,
col_order= [2012 , 2014 , 2016 , 2018 ],
col_wrap= 2 ,
)
g = sns.catplot(
x= "make" ,
y= "city08" ,
data= fueleco[mask],
kind= "box" ,
hue= "year" ,
hue_order= [2012 , 2014 , 2016 , 2018 ],
)
(
fueleco[mask]
.groupby("make" )
.city08.agg(["mean" , "std" ])
.style.background_gradient(cmap= "RdBu" , axis= 0 )
)
make
BMW
17.817377
7.372907
Ford
16.853803
6.701029
Honda
24.372973
9.154064
Tesla
92.826087
5.538970
Comparing two continuous columns
fueleco.city08.cov(fueleco.highway08)
fueleco.city08.cov(fueleco.comb08)
fueleco.city08.cov(fueleco.cylinders)
fueleco.city08.corr(fueleco.highway08)
fueleco.city08.corr(fueleco.cylinders)
fig, ax = plt.subplots(figsize= (8 ,8 ))
corr = fueleco[
["city08" , "highway08" , "cylinders" ]
].corr()
mask = np.zeros_like(corr, dtype= np.bool )
mask[np.triu_indices_from(mask)] = True
sns.heatmap(
corr,
mask= mask,
fmt= ".2f" ,
annot= True ,
ax= ax,
cmap= "RdBu" ,
vmin=- 1 ,
vmax= 1 ,
square= True
)
fig, ax = plt.subplots(figsize= (8 ,8 ))
fueleco.plot.scatter(
x= "city08" , y= "highway08" , alpha= 0.1 , ax= ax
)
fig.savefig(
"c5-scatpan.png" ,
dpi= 300 ,
bbox_inches= "tight" ,
)
fig, ax = plt.subplots(figsize= (8 ,8 ))
fueleco.plot.scatter(
x= "city08" , y= "cylinders" , alpha= 0.1 , ax= ax
)
fig, ax = plt.subplots(figsize= (8 ,8 ))
(
fueleco.assign(
cylinders= fueleco.cylinders.fillna(0 )
).plot.scatter(
x= "city08" , y= "cylinders" , alpha= 0.1 , ax= ax
)
)
res = sns.lmplot(
x= "city08" , y= "highway08" , data= fueleco
)
res = sns.relplot(
x= "city08" ,
y= "highway08" ,
data= fueleco.assign(
cylinders= fueleco.cylinders.fillna(0 )
),
hue= "year" ,
size= "barrels08" ,
alpha= 0.5 ,
height= 8 ,
)
res = sns.relplot(
x= "city08" ,
y= "highway08" ,
data= fueleco.assign(
cylinders= fueleco.cylinders.fillna(0 )
),
hue= "year" ,
size= "barrels08" ,
alpha= 0.5 ,
height= 8 ,
col= "make" ,
col_order= ["Ford" , "Tesla" ]
)
fueleco.city08.corr(
fueleco.barrels08, method= "spearman"
)
Comparing categorical values with categorical value_counts
def generalize(ser, match_name, default):
seen = None
for match, name in match_name:
mask = ser.str .contains(match)
if seen is None :
seen = mask
else :
seen |= mask
ser = ser.where(~ mask, name)
ser = ser.where(seen, default)
return ser
makes = ["Ford" , "Tesla" , "BMW" , "Toyota" ]
data = fueleco[fueleco.make.isin(makes)].assign(
SClass= lambda df_: generalize(
df_.VClass, [
("Seaters" , "Car" ),
("Car" , "Car" ),
("Utility" , "SUV" ),
("Truck" , "Truck" ),
("Van" , "Van" ),
("van" , "Van" ),
("Wagon" , "Wagon" ),
],
"other" ,
)
)
data.groupby(["make" , "SClass" ]).size().unstack()
pd.crosstab(data.make, data.SClass)
make
BMW
1557
158
0
0
92
0
Ford
1075
372
1040
332
155
234
Tesla
36
10
0
0
0
0
Toyota
773
376
478
94
132
123
pd.crosstab(
[data.year, data.make], [data.SClass, data.VClass]
)
VClass
Compact Cars
Large Cars
Midsize Cars
Minicompact Cars
Subcompact Cars
Two Seaters
Vans, Cargo Type
Small Sport Utility Vehicle 2WD
Small Sport Utility Vehicle 4WD
Sport Utility Vehicle - 2WD
Sport Utility Vehicle - 4WD
Standard Sport Utility Vehicle 2WD
Standard Sport Utility Vehicle 4WD
Small Pickup Trucks
Small Pickup Trucks 2WD
Small Pickup Trucks 4WD
Standard Pickup Trucks
Standard Pickup Trucks 2WD
Standard Pickup Trucks 4WD
Minivan - 2WD
Minivan - 4WD
Vans
Vans, Passenger Type
Midsize Station Wagons
Midsize-Large Station Wagons
Small Station Wagons
Special Purpose Vehicle 2WD
Special Purpose Vehicle 4WD
Special Purpose Vehicles
1984
BMW
6
0
0
0
10
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
Ford
33
3
9
0
14
6
26
0
0
0
0
0
0
0
18
11
0
32
26
0
0
0
12
5
2
12
0
21
6
Toyota
13
0
0
3
22
0
4
0
0
0
0
0
0
0
0
0
0
13
3
0
0
0
4
0
0
10
0
3
2
1985
BMW
7
0
0
0
6
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
Ford
31
2
5
0
13
3
0
0
0
0
0
0
0
16
0
0
54
0
0
0
0
26
0
0
3
13
5
9
6
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
2017
Tesla
0
8
0
0
0
0
0
0
0
0
0
0
5
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
Toyota
3
0
16
2
2
0
0
5
3
0
0
2
10
0
2
5
0
3
3
1
1
0
0
1
0
0
0
0
0
2018
BMW
37
12
5
0
25
1
0
0
2
0
0
2
5
0
0
0
0
0
0
0
0
0
0
0
0
2
0
0
0
Ford
0
0
8
0
0
0
0
4
2
0
0
1
2
0
1
0
0
12
14
0
0
0
3
0
0
0
6
0
0
Toyota
4
0
16
0
2
0
0
2
3
0
0
2
5
0
0
0
0
2
3
0
0
0
0
0
0
0
0
0
0
111 rows × 29 columns
fig, ax = plt.subplots(figsize= (10 ,8 ))
(
data.pipe(
lambda df_: pd.crosstab(df_.make, df_.SClass)
).plot.bar(ax= ax)
)
res = sns.catplot(
kind= "count" , x= "make" , hue= "SClass" , data= data
)
fig, ax = plt.subplots(figsize= (10 ,8 ))
(
data.pipe(
lambda df_: pd.crosstab(df_.make, df_.SClass)
)
.pipe(lambda df_: df_.div(df_.sum (axis= 1 ), axis= 0 ))
.plot.bar(stacked= True , ax= ax)
)
Using the pandas profiling library
For compatibility with pydantic v2 need to pip install ydata-profiling and then import ydata-profiling as pp
report = pp.ProfileReport(fueleco)
report.to_file("fueleco.html" )